
##################################################################
##################################################################
## Replication Material
## Widmann & Wich: Creating and Comparing Dictionary, Word Embedding, and Transformer-based 
## Models to Measure Discrete Emotions in German Political Text
## Political Analysis
## tobias.widmann@eui.eu
##
## Script 04: Training of Word Embeddings
##################################################################
##################################################################

# Note: The file 000_readme.pdf describes all scripts and datasets required to replicate the analysis

# This script was run on the following R version, platform and OS:
# R version 4.0.5 (2021-03-31)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS Big Sur 11.5.1

# Report the current session's R version, platform and OS for comparison with
# the setup listed above.
# NOTE(review): this runs before the library() calls below, so packages
# attached by them are not yet part of the report.
sessionInfo()

#### Set Working Directory to the Replication Folder ############################

# Delete hashtag below and fill in the directory of the replication folder
#setwd("")


#### Load Packages ##############################################################

library(keras)        # Version 2.6.0
library(quanteda)     # Version 3.0.0
library(corpus)       # Version 0.10.1
library(devtools)     # Version 2.4.0
#install_github("mukul13/rword2vec")
library(rword2vec)    # Version 1.1
library(readr)        # Version 1.4.0
library(stringr)      # Version 1.4.0


#### Load Data #################################################################
### Transformation Data
# To train word embeddings locally, we first load the transformation data
# (~ 2 million German documents of political text).
load("./ger_corpus.Rdata")

# Pre-process the transformation data.
# preprocess_text() cleans a character vector of raw documents for word2vec
# training: lower-casing, trimming, removal of URLs, @mentions, #hashtags and
# digits, replacement of punctuation by spaces, and collapsing of repeated
# spaces. Returns a character vector of the same length as `text`.
preprocess_text <- function(text) {
  text <- tolower(text) # Everything to lower case
  text <- str_trim(text, side = "both")
  # Remove URL links. Only the URL itself (up to the next whitespace) is
  # dropped; a greedy ".*" after "://" would delete the remainder of the
  # document together with the link.
  text <- gsub(" ?(f|ht)tps?://[^[:space:]]*", "", text)
  text <- paste0(" ", text, " ") # Add white space in the beginning and end
  text <- gsub("@\\w+ *", "", text) # Remove @mentions
  text <- gsub("#\\w+ *", "", text) # Remove #hashtags
  text <- gsub("[[:digit:]]+", "", text) # Remove digits
  text <- gsub("[[:punct:]]", " ", text) # Replace punctuation by a space
  # Collapse every run of spaces into a single one. Replacing the literal
  # "  " in one pass would leave doubled spaces behind wherever three or more
  # spaces were adjacent.
  gsub(" {2,}", " ", text)
}

ger_corpus$sent.text <- preprocess_text(ger_corpus$text)

# Extract the pre-processed documents as a character vector
text.gcorpus <- ger_corpus$sent.text

# Write one element per line; this file is the training input for word2vec
writeLines(text.gcorpus, "text_ed_preprocessed")

# Fit the word embeddings model on the pre-processed corpus file and store
# the resulting vectors in "vec_ed_preprocessed.bin"
model <- word2vec(output_file = "vec_ed_preprocessed.bin",
                  train_file = "text_ed_preprocessed",
                  debug_mode = 1,
                  binary = 1)

# With the trained embeddings on disk, look up the 10 closest words for a few
# example search terms

# "deutschland" = Germany
distance(search_word = "deutschland", num = 10, file_name = "vec_ed_preprocessed.bin")

# "freude" = joy
distance(search_word = "freude", num = 10, file_name = "vec_ed_preprocessed.bin")

# "eklig" = disgusting
distance(search_word = "eklig", num = 10, file_name = "vec_ed_preprocessed.bin")


# Word vectors can be extracted and saved as txt-file.
# The conversion only runs when the txt-file is not there yet, so an existing
# export is reused and never overwritten, while a fresh run no longer fails
# on a missing file in the read step below.
if (!file.exists("vec_ed_preprocessed.txt")) {
  bin_to_txt("vec_ed_preprocessed.bin", "vec_ed_preprocessed.txt")
}

# Read the exported vectors: the first line of the file is skipped and every
# remaining line is parsed as a word followed by space-separated columns
# named V1 ... V100.
# NOTE(review): 100 presumably equals the embedding dimension used in training
# (the word2vec() call above sets no layer size explicitly) - confirm.
embeddings <- read_delim("vec_ed_preprocessed.txt",
                         skip = 1, delim = " ",
                         col_names = c("word", paste0("V", seq_len(100))))





#################################################################################
### END OF SCRIPT ###
#################################################################################



